Lab 7

Author

Amren Hossain

Code
library(tidyverse)
library(lubridate)
Code
 download.file(url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv", 
               destfile = "data/time_series_covid19_confirmed_global.csv")
Code
time_series_confirmed <- read_csv("/home/ashossain_umass_edu/Lab_7/time_series_covid19_confirmed_global.csv")|>
  rename(Province_State = "Province/State", Country_Region = "Country/Region")

time_series_confirmed_long <- time_series_confirmed |> 
               pivot_longer(-c(Province_State, Country_Region, Lat, Long),
                            names_to = "Date", values_to = "Confirmed") 

time_series_confirmed_long$Date <- mdy(time_series_confirmed_long$Date)
time_series_confirmed_long|> 
  group_by(Country_Region, Date) |> 
  summarise(Confirmed = sum(Confirmed)) |> 
  filter (Country_Region == "US") |> 
  ggplot(aes(x = Date,  y = Confirmed)) + 
    geom_point() +
    geom_line() +
    ggtitle("US COVID-19 Confirmed Cases")

Code
time_series_confirmed_long |> 
    group_by(Country_Region, Date) |> 
    summarise(Confirmed = sum(Confirmed)) |> 
    filter (Country_Region %in% c("China","France","Italy", 
                                "Korea, South", "US")) |> 
    ggplot(aes(x = Date,  y = Confirmed, color = Country_Region)) + 
      geom_point() +
      geom_line() +
      ggtitle("COVID-19 Confirmed Cases")

Code
time_series_confirmed_long_daily <-time_series_confirmed_long |> 
    group_by(Country_Region, Date) |> 
    summarise(Confirmed = sum(Confirmed)) |> 
    mutate(Daily = Confirmed - lag(Confirmed, default = first(Confirmed )))
Code
time_series_confirmed_long_daily |> 
    filter (Country_Region == "US") |> 
    ggplot(aes(x = Date,  y = Daily, color = Country_Region)) + 
      geom_point() +
      ggtitle("COVID-19 Confirmed Cases")

Code
time_series_confirmed_long_daily |> 
    filter (Country_Region == "US") |> 
    ggplot(aes(x = Date,  y = Daily, color = Country_Region)) + 
      geom_line() +
      ggtitle("COVID-19 Confirmed Cases")

Code
time_series_confirmed_long_daily |> 
    filter (Country_Region == "US") |> 
    ggplot(aes(x = Date,  y = Daily, color = Country_Region)) + 
      geom_smooth() +
      ggtitle("COVID-19 Confirmed Cases")

Code
time_series_confirmed_long_daily |> 
    filter (Country_Region == "US") |> 
    ggplot(aes(x = Date,  y = Daily, color = Country_Region)) + 
      geom_smooth(method = "gam", se = FALSE) +
      ggtitle("COVID-19 Confirmed Cases")

Animated Graphs

Code
library("gganimate")
library("gifski")
theme_set(theme_bw())

time_series_deaths_confirmed <- read_csv("/home/ashossain_umass_edu/Lab_7/time_series_covid19_deaths_global.csv")|>
  rename(Province_State = "Province/State", Country_Region = "Country/Region")

time_series_deaths_long <- time_series_deaths_confirmed |> 
    pivot_longer(-c(Province_State, Country_Region, Lat, Long),
        names_to = "Date", values_to = "Confirmed") 

time_series_deaths_long$Date <- mdy(time_series_deaths_long$Date)

daily_counts <- time_series_confirmed_long_daily |> 
      filter (Country_Region == "US")

p <- ggplot(daily_counts, aes(x = Date,  y = Daily, color = Country_Region)) + 
        geom_point() +
        ggtitle("Confirmed COVID-19 Cases") +
# gganimate lines  
        geom_point(aes(group = seq_along(Date))) +
        transition_reveal(Date)

# make the animation
anim <- animate(p, renderer = gifski_renderer(), end_pause = 15)
 
anim_save("daily_counts_US.gif", animation = anim)

Exercises

Exercise 1

Code
library(tidyverse)

table1
# A tibble: 6 × 4
  country      year  cases population
  <chr>       <dbl>  <dbl>      <dbl>
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583
Code
table2
# A tibble: 12 × 4
   country      year type            count
   <chr>       <dbl> <chr>           <dbl>
 1 Afghanistan  1999 cases             745
 2 Afghanistan  1999 population   19987071
 3 Afghanistan  2000 cases            2666
 4 Afghanistan  2000 population   20595360
 5 Brazil       1999 cases           37737
 6 Brazil       1999 population  172006362
 7 Brazil       2000 cases           80488
 8 Brazil       2000 population  174504898
 9 China        1999 cases          212258
10 China        1999 population 1272915272
11 China        2000 cases          213766
12 China        2000 population 1280428583
Code
table3
# A tibble: 6 × 3
  country      year rate             
  <chr>       <dbl> <chr>            
1 Afghanistan  1999 745/19987071     
2 Afghanistan  2000 2666/20595360    
3 Brazil       1999 37737/172006362  
4 Brazil       2000 80488/174504898  
5 China        1999 212258/1272915272
6 China        2000 213766/1280428583
Code
table1 |>
  mutate(rate = cases / population * 10000)
# A tibble: 6 × 5
  country      year  cases population  rate
  <chr>       <dbl>  <dbl>      <dbl> <dbl>
1 Afghanistan  1999    745   19987071 0.373
2 Afghanistan  2000   2666   20595360 1.29 
3 Brazil       1999  37737  172006362 2.19 
4 Brazil       2000  80488  174504898 4.61 
5 China        1999 212258 1272915272 1.67 
6 China        2000 213766 1280428583 1.67 
Code
table1 |> 
  group_by(year) |> 
  summarize(total_cases = sum(cases))
# A tibble: 2 × 2
   year total_cases
  <dbl>       <dbl>
1  1999      250740
2  2000      296920
Code
ggplot(table1, aes(x = year, y = cases)) +
  geom_line(aes(group = country), color = "grey50") +
  geom_point(aes(color = country, shape = country)) +
  scale_x_continuous(breaks = c(1999, 2000)) # x-axis breaks at 1999 and 2000

Code
billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "rank"
  )
# A tibble: 24,092 × 5
   artist track                   date.entered week   rank
   <chr>  <chr>                   <date>       <chr> <dbl>
 1 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk1      87
 2 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk2      82
 3 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk3      72
 4 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk4      77
 5 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk5      87
 6 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk6      94
 7 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk7      99
 8 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk8      NA
 9 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk9      NA
10 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk10     NA
# ℹ 24,082 more rows
Code
billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "rank",
    values_drop_na = TRUE
  )
# A tibble: 5,307 × 5
   artist  track                   date.entered week   rank
   <chr>   <chr>                   <date>       <chr> <dbl>
 1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk1      87
 2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk2      82
 3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk3      72
 4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk4      77
 5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk5      87
 6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk6      94
 7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk7      99
 8 2Ge+her The Hardest Part Of ... 2000-09-02   wk1      91
 9 2Ge+her The Hardest Part Of ... 2000-09-02   wk2      87
10 2Ge+her The Hardest Part Of ... 2000-09-02   wk3      92
# ℹ 5,297 more rows
Code
billboard_longer <- billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "rank",
    values_drop_na = TRUE
  ) |> 
  mutate(
    week = parse_number(week)
  )

billboard_longer |> 
  ggplot(aes(x = week, y = rank, group = track)) + 
  geom_line(alpha = 0.25) + 
  scale_y_reverse()

Code
df <- tribble(
  ~id,  ~bp1, ~bp2,
   "A",  100,  120,
   "B",  140,  115,
   "C",  120,  125
)

df |> 
  pivot_longer(
    cols = bp1:bp2,
    names_to = "measurement",
    values_to = "value"
  )
# A tibble: 6 × 3
  id    measurement value
  <chr> <chr>       <dbl>
1 A     bp1           100
2 A     bp2           120
3 B     bp1           140
4 B     bp2           115
5 C     bp1           120
6 C     bp2           125
Code
who2 |> 
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"), 
    names_sep = "_",
    values_to = "count"
  )
# A tibble: 405,440 × 6
   country      year diagnosis gender age   count
   <chr>       <dbl> <chr>     <chr>  <chr> <dbl>
 1 Afghanistan  1980 sp        m      014      NA
 2 Afghanistan  1980 sp        m      1524     NA
 3 Afghanistan  1980 sp        m      2534     NA
 4 Afghanistan  1980 sp        m      3544     NA
 5 Afghanistan  1980 sp        m      4554     NA
 6 Afghanistan  1980 sp        m      5564     NA
 7 Afghanistan  1980 sp        m      65       NA
 8 Afghanistan  1980 sp        f      014      NA
 9 Afghanistan  1980 sp        f      1524     NA
10 Afghanistan  1980 sp        f      2534     NA
# ℹ 405,430 more rows
Code
household |> 
  pivot_longer(
    cols = !family, 
    names_to = c(".value", "child"), 
    names_sep = "_", 
    values_drop_na = TRUE
  )
# A tibble: 9 × 4
  family child  dob        name  
   <int> <chr>  <date>     <chr> 
1      1 child1 1998-11-26 Susan 
2      1 child2 2000-01-29 Jose  
3      2 child1 1996-06-22 Mark  
4      3 child1 2002-07-11 Sam   
5      3 child2 2004-04-05 Seth  
6      4 child1 2004-10-10 Craig 
7      4 child2 2009-08-27 Khai  
8      5 child1 2000-12-05 Parker
9      5 child2 2005-02-28 Gracie
Code
cms_patient_experience |> 
  distinct(measure_cd, measure_title)
# A tibble: 6 × 2
  measure_cd   measure_title                                                    
  <chr>        <chr>                                                            
1 CAHPS_GRP_1  CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Infor…
2 CAHPS_GRP_2  CAHPS for MIPS SSM: How Well Providers Communicate               
3 CAHPS_GRP_3  CAHPS for MIPS SSM: Patient's Rating of Provider                 
4 CAHPS_GRP_5  CAHPS for MIPS SSM: Health Promotion and Education               
5 CAHPS_GRP_8  CAHPS for MIPS SSM: Courteous and Helpful Office Staff           
6 CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources             
Code
cms_patient_experience |> 
  pivot_wider(
    names_from = measure_cd,
    values_from = prf_rate
  )
# A tibble: 500 × 9
   org_pac_id org_nm           measure_title CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3
   <chr>      <chr>            <chr>               <dbl>       <dbl>       <dbl>
 1 0446157747 USC CARE MEDICA… CAHPS for MI…          63          NA          NA
 2 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          87          NA
 3 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          86
 4 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          NA
 5 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          NA
 6 0446157747 USC CARE MEDICA… CAHPS for MI…          NA          NA          NA
 7 0446162697 ASSOCIATION OF … CAHPS for MI…          59          NA          NA
 8 0446162697 ASSOCIATION OF … CAHPS for MI…          NA          85          NA
 9 0446162697 ASSOCIATION OF … CAHPS for MI…          NA          NA          83
10 0446162697 ASSOCIATION OF … CAHPS for MI…          NA          NA          NA
# ℹ 490 more rows
# ℹ 3 more variables: CAHPS_GRP_5 <dbl>, CAHPS_GRP_8 <dbl>, CAHPS_GRP_12 <dbl>
Code
df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "A",        "bp1",    102,
  "A",        "bp2",    120,
  "B",        "bp1",    140, 
  "B",        "bp2",    115
)

df |> 
  pivot_wider(
    names_from = measurement,
    values_from = value
  )
# A tibble: 2 × 3
  id    bp1       bp2      
  <chr> <list>    <list>   
1 A     <dbl [2]> <dbl [1]>
2 B     <dbl [1]> <dbl [1]>
Code
df |> 
  select(-measurement, -value) |> 
  distinct() |> 
  mutate(x = NA, y = NA, z = NA)
# A tibble: 2 × 4
  id    x     y     z    
  <chr> <lgl> <lgl> <lgl>
1 A     NA    NA    NA   
2 B     NA    NA    NA   
Code
df |>
  pivot_wider(
    names_from = measurement,
    values_from = value
  )
# A tibble: 2 × 3
  id    bp1       bp2      
  <chr> <list>    <list>   
1 A     <dbl [2]> <dbl [1]>
2 B     <dbl [1]> <dbl [1]>

Exercise 2

Instead of making a graph of 5 countries on the same graph as in the above example, use facet_wrap with scales=“free_y”.

Code
p <- time_series_deaths_long |>
  filter(Country_Region %in% c("US", "Canada", "Mexico", "Brazil", "Egypt", 
                               "Ecuador", "India", "Netherlands", "Germany", "China")) |>
  ggplot(aes(x = Date, y = Confirmed, color = Country_Region)) +
  geom_line() +
  facet_wrap(~ Country_Region, scales = "free_y") +
  transition_reveal(Date) +
  labs(title = "Cumulative Deaths Over Time: {frame_along}", y = "Deaths") +
  theme_minimal()

anim <- animate(p, renderer = gifski_renderer(), end_pause = 15)

anim_save("deaths_by_country_facet.gif", animation = anim)

Exercise 3

Using the daily count of confirmed cases, make a single graph with 5 countries of your choosing.

Code
daily_counts <- time_series_confirmed_long_daily |> 
  filter(Country_Region %in% c("US", "India", "Brazil", "Germany", "Mexico"))

# Step 2: Create the animated plot
p <- ggplot(daily_counts, aes(x = Date, y = Daily, color = Country_Region)) +
  geom_line() +  # Use line to better show daily trends
  labs(
    title = "Daily Confirmed COVID-19 Cases: {frame_along}",
    y = "Daily Cases",
    x = "Date",
    color = "Country"
  ) +
  transition_reveal(Date) +
  theme_minimal()

# Step 3: Animate and save
anim <- animate(p, renderer = gifski_renderer(), end_pause = 15)
anim_save("daily_confirmed_5countries.gif", animation = anim)

Exercise 4

Plot the cumulative deaths in the US, Canada and Mexico (you will need to download time_series_covid19_deaths_global.csv)

Code
deaths_raw <- read.csv("/home/ashossain_umass_edu/Lab_7/time_series_covid19_deaths_global.csv") |>
  rename(Province_State = "Province.State", Country_Region = "Country.Region")

time_series_deaths_long <- time_series_deaths_confirmed |> 
  pivot_longer(
    cols = -c(Province_State, Country_Region, Lat, Long),
    names_to = "Date",
    values_to = "Deaths"
  )

# Convert Date to Date format
time_series_deaths_long$Date <- mdy(time_series_deaths_long$Date)

deaths_summary <- time_series_deaths_long |>
  filter(Country_Region %in% c("US", "Canada", "Mexico")) |>
  group_by(Country_Region, Date) |>
  summarise(Cumulative_Deaths = sum(Deaths), .groups = "drop")

p <- ggplot(deaths_summary, aes(x = Date, y = Cumulative_Deaths, color = Country_Region)) +
  geom_line(size = 1) +
  labs(
    title = "Cumulative COVID-19 Deaths: US, Canada, Mexico",
    x = "Date",
    y = "Cumulative Deaths",
    color = "Country"
  )

p

Exercise 5

Make a graph with the countries of your choice using the daily deaths data

Code
deaths_raw <- read.csv("/home/ashossain_umass_edu/Lab_7/time_series_covid19_deaths_global.csv") |>
  rename(Province_State = "Province.State", Country_Region = "Country.Region")

time_series_deaths_long <- time_series_deaths_confirmed |> 
  pivot_longer(
    cols = -c(Province_State, Country_Region, Lat, Long),
    names_to = "Date",
    values_to = "Deaths"
  )

# Convert Date to Date format
time_series_deaths_long$Date <- mdy(time_series_deaths_long$Date)

deaths_summary <- time_series_deaths_long |>
  filter(Country_Region %in% c("China", "Latvia", "Albania")) |>
  group_by(Country_Region, Date) |>
  summarise(Cumulative_Deaths = sum(Deaths), .groups = "drop")

ggplot(deaths_summary, aes(x = Date, y = Cumulative_Deaths, color = Country_Region)) +
  geom_line(size = 1) +
  labs(
    title = "Cumulative COVID-19 Deaths: China, Latvia, Albania",
    x = "Date",
    y = "Cumulative Deaths",
    color = "Country"
  )

Exercise 6

Make an animation of your choosing (do not use a graph with geom_smooth)

Code
deaths_data <- read_csv("/home/ashossain_umass_edu/Lab_7/time_series_covid19_deaths_global.csv") |>
  rename(Province_State = "Province/State", Country_Region = "Country/Region")

# Convert to long format
deaths_long <- deaths_data |>
  pivot_longer(
    cols = -c(Province_State, Country_Region, Lat, Long),
    names_to = "Date",
    values_to = "Cumulative_Deaths"
  ) |>
  mutate(Date = mdy(Date))


daily_deaths <- deaths_long |>
  group_by(Country_Region, Date) |>
  summarise(Cumulative_Deaths = sum(Cumulative_Deaths), .groups = "drop") |>
  group_by(Country_Region) |>
  arrange(Date) |>
  mutate(Daily_Deaths = Cumulative_Deaths - lag(Cumulative_Deaths)) |>
  ungroup()


selected_countries <- c("US", "India", "Brazil", "Germany", "Canada")

daily_deaths_filtered <- daily_deaths |>
  filter(Country_Region %in% selected_countries, !is.na(Daily_Deaths))

p <- ggplot(daily_deaths_filtered, aes(x = Date, y = Daily_Deaths, color = Country_Region)) +
  geom_line(size = 1) +
  labs(
    title = "Daily COVID-19 Deaths Over Time",
    subtitle = "Date: {frame_along}",
    x = "Date",
    y = "Daily Deaths",
    color = "Country"
  ) +
  transition_reveal(Date) +
  theme_minimal()

anim <- animate(p, renderer = gifski_renderer(), end_pause = 15)
anim_save("daily_deaths_animation.gif", animation = anim)